library(tidyverse)
library(ggplot2)
library(scales)
library(ggthemes)
library(gridExtra)
The diamonds data set is available with the ggplot2 package. It contains approximately 54K observations and 10 variables, which include carat, cut, color, clarity, depth, table, price, x (length in mm), y (width in mm), and z (depth in mm). It is a clean data set, with no missing values.
summary(diamonds)
## carat cut color clarity depth
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065 Min. :43.00
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258 1st Qu.:61.00
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194 Median :61.80
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171 Mean :61.75
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066 3rd Qu.:62.50
## Max. :5.0100 I: 5422 VVS1 : 3655 Max. :79.00
## J: 2808 (Other): 2531
## table price x y
## Min. :43.00 Min. : 326 Min. : 0.000 Min. : 0.000
## 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710 1st Qu.: 4.720
## Median :57.00 Median : 2401 Median : 5.700 Median : 5.710
## Mean :57.46 Mean : 3933 Mean : 5.731 Mean : 5.735
## 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540 3rd Qu.: 6.540
## Max. :95.00 Max. :18823 Max. :10.740 Max. :58.900
##
## z
## Min. : 0.000
## 1st Qu.: 2.910
## Median : 3.530
## Mean : 3.539
## 3rd Qu.: 4.040
## Max. :31.800
##
# First 10 lines
head(diamonds, 10)
## # A tibble: 10 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
diamonds %>%
group_by(cut) %>%
summarise(max_price = max(price),
min_price = min(price),
mean_price = mean(price),
median_price = median(price),
count = n())
## # A tibble: 5 × 6
## cut max_price min_price mean_price median_price count
## <ord> <int> <int> <dbl> <dbl> <int>
## 1 Fair 18574 337 4359. 3282 1610
## 2 Good 18788 327 3929. 3050. 4906
## 3 Very Good 18818 336 3982. 2648 12082
## 4 Premium 18823 326 4584. 3185 13791
## 5 Ideal 18806 326 3458. 1810 21551
diamonds %>%
group_by(color) %>%
summarise(max_price = max(price),
min_price = min(price),
mean_price = mean(price),
median_price = median(price),
count = n())
## # A tibble: 7 × 6
## color max_price min_price mean_price median_price count
## <ord> <int> <int> <dbl> <dbl> <int>
## 1 D 18693 357 3170. 1838 6775
## 2 E 18731 326 3077. 1739 9797
## 3 F 18791 342 3725. 2344. 9542
## 4 G 18818 354 3999. 2242 11292
## 5 H 18803 337 4487. 3460 8304
## 6 I 18823 334 5092. 3730 5422
## 7 J 18710 335 5324. 4234 2808
diamonds %>%
group_by(clarity) %>%
summarise(max_price = max(price),
min_price = min(price),
mean_price = mean(price),
median_price = median(price),
count = n())
## # A tibble: 8 × 6
## clarity max_price min_price mean_price median_price count
## <ord> <int> <int> <dbl> <dbl> <int>
## 1 I1 18531 345 3924. 3344 741
## 2 SI2 18804 326 5063. 4072 9194
## 3 SI1 18818 326 3996. 2822 13065
## 4 VS2 18823 334 3925. 2054 12258
## 5 VS1 18795 327 3839. 2005 8171
## 6 VVS2 18768 336 3284. 1311 5066
## 7 VVS1 18777 336 2523. 1093 3655
## 8 IF 18806 369 2865. 1080 1790
ggplot(diamonds, aes(x = price)) +
geom_histogram(color = 'black', fill = 'SteelBlue', binwidth = 500) +
scale_x_continuous(labels = dollar, breaks = seq(0, 20000, 1000)) +
labs(title = 'Distribution of diamonds price',
x = 'Price', y = 'Count') +
theme_fivethirtyeight() +
theme(axis.title = element_text(), axis.text.x = element_text(angle=90))
ggplot(diamonds, aes(x = price)) +
geom_histogram(color = 'black', fill = 'SteelBlue', binwidth = 50) +
scale_x_continuous(labels = dollar, breaks = seq(0, 2000, 100)) +
labs(title = 'Price distribution of diamonds - 0-2000$',
x = 'Price', y = 'Count') +
coord_cartesian(c(0, 2000)) +
theme_fivethirtyeight() +
theme(axis.title = element_text(), axis.text.x = element_text(angle=90))
ggplot(diamonds, aes(x=price)) +
geom_histogram(color='black', fill='SteelBlue', binwidth = 30) +
scale_x_continuous(labels=dollar, breaks=seq(0, 5000, 200)) +
labs(title='Price distribution by cut quality - 0-5000$',
x = 'Price', y = 'Count') +
theme_fivethirtyeight() +
theme(axis.title = element_text(), axis.text.x = element_text(angle=90)) +
coord_cartesian(c(0, 5000)) +
facet_grid(cut~.)
qplot(x = price, data=diamonds) +
facet_wrap(~cut, scales = 'free') +
labs(x = 'Price', y = 'Count')
cut_price <- data.frame(diamonds$cut, diamonds$price)
fair <- cut_price %>%
filter(cut_price$diamonds.cut == 'Fair')
summary(fair$diamonds.price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 337 2050 3282 4359 5206 18574
good <- cut_price %>%
filter(cut_price$diamonds.cut == 'Good')
summary(good$diamonds.price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 327 1145 3050 3929 5028 18788
v_good <- cut_price %>%
filter(cut_price$diamonds.cut == 'Very Good')
summary(v_good$diamonds.price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 336 912 2648 3982 5373 18818
premium <- cut_price %>%
filter(cut_price$diamonds.cut == 'Premium')
summary(premium$diamonds.price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 1046 3185 4584 6296 18823
ideal <- cut_price %>%
filter(cut_price$diamonds.cut == 'Ideal')
summary(ideal$diamonds.price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 878 1810 3458 4678 18806
ggplot(diamonds, aes(x=price/carat)) +
geom_histogram(color='black', fill='SteelBlue', binwidth=0.05) +
labs(title='Price/Carat by Cut quality',
x = 'Price/Carat', y = 'Count') +
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
theme_fivethirtyeight() +
theme(axis.title = element_text(), axis.text.x = element_text()) +
facet_grid(cut~., scale = "free")
ggplot(diamonds, aes(x=clarity, y=price, fill=cut)) +
geom_boxplot() +
labs(title='Price vs Clarity',
subtitle='per cut',
x = 'Clarity', y = 'Price') +
theme_fivethirtyeight() +
theme(axis.title = element_text(), axis.text.x = element_text())
ggplot(diamonds, aes(x=clarity, y=price, fill=cut)) +
geom_boxplot() +
labs(x = 'Clarity', y = 'Price',
title='Distribution of price',
subtitle='across cut, clarity and color') +
theme_fivethirtyeight() +
theme(axis.title = element_text(), axis.text.x = element_text()) +
facet_grid(color~.)
ggplot(diamonds, aes(x=color, y=price/carat, fill=color)) +
geom_boxplot() +
labs(title='Distribution of price',
subsitle='across colors',
x='Color',
y='Price per Carat') +
coord_cartesian(ylim=c(1000, 6000)) +
scale_y_continuous(labels=dollar)
ggplot(diamonds, aes(x=carat, y=price)) +
geom_point(alpha=.05) +
labs(title='Price vs Carat',
subtitle='Omitted top 1%') +
scale_x_continuous(limits=c(0, quantile(diamonds$carat, 0.99))) +
scale_y_continuous(breaks=seq(0, 18000, 2000),
limits=c(0, quantile(diamonds$price, 0.99)),
labels=dollar) +
theme_fivethirtyeight() +
theme(axis.title = element_text(), axis.text.x = element_text()) +
theme(plot.subtitle = element_text(size=10))
## Warning: Removed 926 rows containing missing values (geom_point).
Takeaways
diamonds %>%
group_by(color) %>%
filter(color == 'D') %>%
summarise(quartile_25 = quantile(price, 0.25),
quartile_75 = quantile(price, 0.75),
IQR = quartile_75 - quartile_25)
## # A tibble: 1 × 4
## color quartile_25 quartile_75 IQR
## <ord> <dbl> <dbl> <dbl>
## 1 D 911 4214. 3302.
diamonds %>%
group_by(color) %>%
filter(color == 'J') %>%
summarise(quartile_25 = quantile(price, 0.25),
quartile_75 = quantile(price, 0.75),
IQR = quartile_75 - quartile_25)
## # A tibble: 1 × 4
## color quartile_25 quartile_75 IQR
## <ord> <dbl> <dbl> <dbl>
## 1 J 1860. 7695 5834.
diamonds %>%
group_by(clarity) %>%
filter(clarity == 'I1') %>%
summarise(quartile_25 = quantile(price, 0.25),
quartile_75 = quantile(price, 0.75),
IQR = quartile_75 - quartile_25)
## # A tibble: 1 × 4
## clarity quartile_25 quartile_75 IQR
## <ord> <dbl> <dbl> <dbl>
## 1 I1 2080 5161 3081
diamonds %>%
group_by(clarity) %>%
filter(clarity == 'IF') %>%
summarise(quartile_25 = quantile(price, 0.25),
quartile_75 = quantile(price, 0.75),
IQR = quartile_75 - quartile_25)
## # A tibble: 1 × 4
## clarity quartile_25 quartile_75 IQR
## <ord> <dbl> <dbl> <dbl>
## 1 IF 895 2388. 1494.
diamonds %>%
group_by(cut) %>%
filter(cut == 'Fair') %>%
summarise(quartile_25 = quantile(price, 0.25),
quartile_75 = quantile(price, 0.75),
IQR = quartile_75 - quartile_25)
## # A tibble: 1 × 4
## cut quartile_25 quartile_75 IQR
## <ord> <dbl> <dbl> <dbl>
## 1 Fair 2050. 5206. 3155.
diamonds %>%
group_by(cut) %>%
filter(cut == 'Ideal') %>%
summarise(quartile_25 = quantile(price, 0.25),
quartile_75 = quantile(price, 0.75),
IQR = quartile_75 - quartile_25)
## # A tibble: 1 × 4
## cut quartile_25 quartile_75 IQR
## <ord> <dbl> <dbl> <dbl>
## 1 Ideal 878 4678. 3800.
ggplot(diamonds, aes(x=carat)) +
geom_freqpoly(binwidth=0.1) +
scale_x_continuous() +
scale_y_continuous() +
labs(title='Frequency of diamonds weight',
subsitle='across colors',
x='Carat',
y='Count') +
theme_fivethirtyeight() +
theme(axis.title = element_text(), axis.text.x = element_text())
ggplot(diamonds, aes(x=carat)) +
geom_freqpoly(binwidth=0.05) +
scale_x_continuous(breaks=c(0.1, 0.5, 0.8, 1.2, 1.5, 2.0, 3.0, 5.0), expand = c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
geom_vline(xintercept=c(0.2, 0.4, 0.7, 0.79, 1.04, 5.01), color='#7daa8e', linetype='longdash') +
labs(title='Frequency of diamonds weight',
subtitle='Green vlines - summary(diamonds$carat)',
x='Carat',
y='Count') +
theme_fivethirtyeight() +
theme(axis.title = element_text(), axis.text.x = element_text()) +
theme(plot.subtitle = element_text(size=10))
summary(diamonds$carat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2000 0.4000 0.7000 0.7979 1.0400 5.0100
ggplot(diamonds, aes(x=x, y=price)) +
geom_point(alpha=.05) +
coord_cartesian(xlim=c(3.5, 11))+
labs(title='Price vs X') +
scale_y_continuous(breaks=seq(0, 20000, 2000), label=dollar)
ggplot(diamonds, aes(x=y, y=price)) +
geom_point(alpha=.05) +
coord_cartesian(xlim=c(3.5, 11))+
labs(title='Price vs Y') +
scale_y_continuous(breaks=seq(0, 20000, 2000), label=dollar)
ggplot(diamonds, aes(x=z, y=price)) +
geom_point(alpha=.05) +
coord_cartesian(xlim=c(1, 7))+
labs(title='Price vs Z') +
scale_y_continuous(breaks=seq(0, 20000, 2000), label=dollar)
# For the p-value, a typical threshold is 0.05, anything smaller counts as statistically significant
# For price vs X
with(diamonds, cor.test(price, x))
##
## Pearson's product-moment correlation
##
## data: price and x
## t = 440.16, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8825835 0.8862594
## sample estimates:
## cor
## 0.8844352
# For price vs Y
with(diamonds, cor.test(price, y))
##
## Pearson's product-moment correlation
##
## data: price and y
## t = 401.14, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8632867 0.8675241
## sample estimates:
## cor
## 0.8654209
# For price vs Z
with(diamonds, cor.test(price, z))
##
## Pearson's product-moment correlation
##
## data: price and z
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8590541 0.8634131
## sample estimates:
## cor
## 0.8612494
ggplot(diamonds, aes(x=depth, y=price)) +
geom_point(alpha=.05) +
coord_cartesian(xlim=c(40, 80))+
labs(title='Price vs Depth') +
scale_y_continuous(breaks=seq(0, 20000, 2000), label=dollar)
ggplot(diamonds, aes(x=depth, y=price)) +
labs(title='Price vs Depth') +
geom_point(alpha=1/100) +
scale_x_continuous(breaks=seq(min(diamonds$depth), max(diamonds$depth), 2), labels=seq(min(diamonds$depth), max(diamonds$depth), 2))
with(diamonds, cor.test(price, depth))
##
## Pearson's product-moment correlation
##
## data: price and depth
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.019084756 -0.002208537
## sample estimates:
## cor
## -0.0106474
diamonds_v <- diamonds %>%
mutate(volume=x*y*z)
diamonds_v2 <- diamonds_v %>%
filter(volume != 0,
volume <= 800)
ggplot(diamonds_v, aes(x=volume, y=price)) +
geom_point() +
labs(title='Price vs Volume') +
theme_fivethirtyeight() +
theme(axis.title = element_text())
with(subset(diamonds_v, !(volume == 0 | volume >= 800)), cor.test(price, volume))
##
## Pearson's product-moment correlation
##
## data: price and volume
## t = 559.19, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9222944 0.9247772
## sample estimates:
## cor
## 0.9235455
ggplot(diamonds_v2, aes(x=volume, y=price)) +
geom_point(alpha=.05) +
geom_smooth(method='lm') +
labs(title='Price vs Volume',
subtitle='Omitted top 1%') +
scale_y_continuous(limits=c(0, 20000)) +
theme_fivethirtyeight() +
theme(axis.title = element_text()) +
theme(plot.subtitle = element_text(size=10))
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 37 rows containing missing values (geom_smooth).
diamondsByClarity <- diamonds %>%
group_by(clarity) %>%
summarise(mean_price = mean(price),
median_price = median(price),
min_price = min(price),
max_price = max(price),
count = n()) %>%
arrange(clarity)
# By clarity and color separately
diamonds_mp_cla <- diamonds %>%
group_by(clarity) %>%
summarise(mean_price = mean(price))
diamonds_mp_col <- diamonds %>%
group_by(color) %>%
summarise(mean_price = mean(price))
# Using library(gridExtra)
plt1 <- ggplot(diamonds_mp_cla, aes(x=clarity, y=mean_price, fill=clarity)) +
geom_bar(stat = "identity", color='black') +
guides(fill=guide_legend(ncol=2))
plt2 <- ggplot(diamonds_mp_col, aes(x=color, y=mean_price, fill=color)) +
geom_bar(stat = "identity", color='black') +
guides(fill=guide_legend(ncol=2))
grid.arrange(plt1, plt2)
ggplot(diamonds, aes(x=table, y=price, color=cut)) +
geom_point(size=2.5) +
labs(title='Price vs Table',
x='table',
y='price') +
scale_x_continuous(breaks=seq(50, 80, 2),
limits=c(50, 80)) +
scale_color_brewer(palette = 'RdGy') +
theme_fivethirtyeight() +
theme(axis.title = element_text(), axis.text.x = element_text())
## Warning: Removed 5 rows containing missing values (geom_point).
# Add volume column
diamonds <- diamonds %>%
mutate(volume = x * y * z)
ggplot(subset(diamonds, volume <= quantile(volume, 0.99) & volume > 0), aes(x=volume, y=price, color=clarity)) +
geom_point(size=2.5) +
scale_y_log10(labels=dollar,
breaks=c(0, 1000, 10000)) +
scale_color_brewer(palette = 'BrBG') +
theme_minimal()
ggplot(diamonds, aes(x=cut, y=price/carat, color=color)) +
geom_jitter() +
facet_wrap(~clarity) +
scale_color_brewer(palette = 'BrBG')